Eleftherios Meletis
Polychronis Kostoulas
Evangelia Anyfanti
2024-04-22 & 2024-04-23
In this session we’ll see how we can estimate a probability of interest but in a Bayesian framework, i.e. using Bayes theorem.
Bayes’ theorem - P(A|B) = P(B|A)*P(A)/P(B)
Components
Where:
To estimate the posterior distribution P(theta|y) we need to:
Hemophilia is a disease that exhibits X-chromosome-linked recessive inheritance. Consider a woman who has an affected brother, which implies that her mother must be a carrier of the hemophilia gene. We are also told that her father is not affected. Suppose she has two sons, neither of whom is affected.
y out of n individuals test positive. Estimate the apparent prevalence in a Bayesian framework.
Parameter of interest: ap - [0,1]
Data: n tested, y positive
Prior distribution for ap: ap ~ Beta(a,b)
Likelihood: y ~ Binomial(n,ap)
ap_model <-
'model {
# Define likelihood distribution of the data
# JAGS Binomial distribution Arguments: p, n
y1 ~ dbin(ap1,n1)
y2 ~ dbin(ap2,n2)
# Specify prior distribution for par of interest
# Uniform (non-informative) prior distribution
ap1 ~ dbeta(1, 100000)
ap2 ~ dbeta(1, 100000)
#data# n1, y1, n2, y2
#monitor# ap1, ap2
#inits# ap1
}
'# Initial values for par of interest
ap1 <- list(chain1=0.05, chain2=0.95)
# Run the model
results <- run.jags(ap_model, n.chains=1,
burnin=5000, sample=10000)
## Warning: Convergence cannot be assessed with only 1 chain
# View results
# Plot results
plot(results)# Print results
summary(results)
## Lower95 Median Upper95 Mean SD Mode
## ap1 1.091972e-02 0.0115451934 0.0122377064 0.0115501328 3.355818e-04 NA
## ap2 6.553641e-05 0.0001266365 0.0002000996 0.0001296519 3.512836e-05 NA
## MCerr MC%ofSD SSeff AC.10 psrf
## ap1 4.270690e-06 1.3 6174 0.001341437 NA
## ap2 4.682192e-07 1.3 5629 -0.002783184 NA
# Convert the results to 'mcmc.list' object
mcmc_ap <- as.mcmc.list(results)
# Convert 'mcmc.list' to data frame for ggplot
df_ap <- as.data.frame(as.mcmc(mcmc_ap))
library(ggplot2)
library(tidyr) # For pivot_longer
# Assuming df_ap contains the columns for ap1 and ap2
# If your data frame isn't set up this way, adjust the data extraction part accordingly.
# Convert the dataframe to long format
df_long <- pivot_longer(df_ap, cols = c(ap1, ap2), names_to = "parameter", values_to = "value")
library(ggplot2)
# Assuming df_long is already created and contains the parameters 'ap1' and 'ap2' in long format
# Generate a sample from a beta distribution
beta_data <- data.frame(value = rbeta(10000, 1, 10000))
beta_data$parameter <- "Beta Distribution"
# Combine this with your existing data
df_long <- rbind(df_long, beta_data)
# Plotting
p <- ggplot(df_long, aes(x = value, fill = parameter)) +
geom_density(alpha = 0.6, adjust = 1) + # Adjust can be tweaked for smoother curves
scale_fill_manual(values = c(ap1 = "blue", ap2 = "red", `Beta Distribution` = "green")) +
labs(title = "Overlay of Density Plots for Apparent Prevalences ap1, ap2, and Beta Distribution",
x = "Prevalence Estimate",
y = "Density") +
theme_minimal()
# Print the plot
print(p)Assuming the absence of a perfect test we do not know how many individuals are truly positive/negative.
Instead we know that n individuals are tested with an imperfect test and y have a positive result.
Apparent/True prevalence: ap/tp
Sensitivity: Se
Specificity: Sp
ap = P(T+) = P(T + & D+) + P(T+ & D-) = P(D+) * P(T+|D+) + P(D-) * P(T+|D-) =>
ap = tp * Se + (1 - tp) * (1 - Sp)
Prior distributions
Data: n tested, y positive
# Call JAGS
library(runjags)
# Provide Data
n = 4072
y = 1210
# Initial values for pars of interest
tp <- list(chain1=0.05, chain2=0.95)
Se <- list(chain1=0.05, chain2=0.95)
Sp <- list(chain1=0.05, chain2=0.95)
# Run the model
results <- run.jags(tp_model, n.chains=2,
burnin=5000, sample=10000)
## View results
# Plot results
plot(results) # Click backwards to view all plots# Print results
summary(results)
## Lower95 Median Upper95 Mean SD Mode MCerr MC%ofSD
## tp 0.2403075 0.2978380 0.3684788 0.3001641 0.03217121 NA 0.0012618569 3.9
## Se 0.7566369 0.8866616 0.9812408 0.8770171 0.06178538 NA 0.0022247881 3.6
## Sp 0.9076471 0.9524757 0.9866954 0.9497712 0.02142762 NA 0.0006325174 3.0
## SSeff AC.10 psrf
## tp 650 0.5536472 1.014044
## Se 771 0.4485606 1.002960
## Sp 1148 0.3088396 1.003026Hui-Walter paradigm/model (1980)
A particular model formulation that was originally designed for evaluating diagnostic tests in the absence of a gold standard
Not originally/necessarily Bayesian - implemented using Maximum Likelihood
Evaluating an imperfect test against another imperfect test
If we don’t know the true disease status, how can we estimate sensitivity or specificity for either test?
https://www.youtube.com/watch?v=z6devQmW2xE&ab_channel=PolychronisKostoulas#
Hui-Walter (1980) dataset Table 1
pop_1 = matrix(nrow=3,ncol=3)
rownames(pop_1) = c("Mantoux_Test_Pos", "Mantoux_Test_Neg", "Total")
colnames(pop_1) = c("Tine_Test_Pos", "Tine_Test_Neg", "Total")
pop_1[1,1] = 14
pop_1[1,2] = 4
pop_1[2,1] = 9
pop_1[2,2] = 528
#Total rows and columns
pop_1[1,3] = pop_1[1,1] + pop_1[1,2]
pop_1[2,3] = pop_1[2,1] + pop_1[2,2]
pop_1[3,1] = pop_1[1,1] + pop_1[2,1]
pop_1[3,2] = pop_1[1,2] + pop_1[2,2]
N_1 = sum(pop_1[1,1] + pop_1[1,2] + pop_1[2,1] + pop_1[2,2])
pop_1[3,3] = N_1
pop_1
## Tine_Test_Pos Tine_Test_Neg Total
## Mantoux_Test_Pos 14 4 18
## Mantoux_Test_Neg 9 528 537
## Total 23 532 555## Now let's do pop_2
pop_2 = matrix(nrow=3,ncol=3)
rownames(pop_2) = c("Mantoux_Test_Pos", "Mantoux_Test_Neg", "Total")
colnames(pop_2) = c("Tine_Test_Pos", "Tine_Test_Neg", "Total")
pop_2[1,1] = 887
pop_2[1,2] = 31
pop_2[2,1] = 37
pop_2[2,2] = 367
#Total rows and columns
pop_2[1,3] = pop_2[1,1] + pop_2[1,2]
pop_2[2,3] = pop_2[2,1] + pop_2[2,2]
pop_2[3,1] = pop_2[1,1] + pop_2[2,1]
pop_2[3,2] = pop_2[1,2] + pop_2[2,2]
N_2 = sum(pop_2[1,1] + pop_2[1,2] + pop_2[2,1] + pop_2[2,2])
pop_2[3,3] = N_2
pop_2
## Tine_Test_Pos Tine_Test_Neg Total
## Mantoux_Test_Pos 887 31 918
## Mantoux_Test_Neg 37 367 404
## Total 924 398 1322A particular model formulation that was originally designed for evaluating diagnostic tests in the absence of a gold standard
Also known as the two_test - two_population setting/paradigm
hw_definition <- c("model{
Population_1 ~ dmulti(prob_1, N_1)
Population_2 ~ dmulti(prob_2, N_2)
#Population_1
# Test1+ Test2+
prob_1[1] <- (prev[1] * ((se[1])*(se[2]))) + ((1-prev[1]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_1[2] <- (prev[1] * ((se[1])*(1-se[2]))) + ((1-prev[1]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_1[3] <- (prev[1] * ((1-se[1])*(se[2]))) + ((1-prev[1]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_1[4] <- (prev[1] * ((1-se[1])*(1-se[2]))) + ((1-prev[1]) * ((sp[1])*(sp[2])))
#Population_2
# Test1+ Test2+
prob_2[1] <- (prev[2] * ((se[1])*(se[2]))) + ((1-prev[2]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_2[2] <- (prev[2] * ((se[1])*(1-se[2]))) + ((1-prev[2]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_2[3] <- (prev[2] * ((1-se[1])*(se[2]))) + ((1-prev[2]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_2[4] <- (prev[2] * ((1-se[1])*(1-se[2]))) + ((1-prev[2]) * ((sp[1])*(sp[2])))
prev[1] ~ dbeta(1, 1)
prev[2] ~ dbeta(1, 1)
se[1] ~ dbeta(1, 1)I(1-sp[1], )
sp[1] ~ dbeta(1, 1)
se[2] ~ dbeta(1, 1)I(1-sp[2], )
sp[2] ~ dbeta(1, 1)
#data# Population_1, Population_2, N_1, N_2
#monitor# prev, prob_1, prob_2, se, sp
#inits# prev, se, sp
}
")library('runjags')
Population_1 <- as.numeric(pop_1[1:2,1:2])
Population_2 <- as.numeric(pop_2[1:2,1:2])
prev <- list(chain1=c(0.05,0.99), chain2=c(0.95,0.05))
se <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
sp <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
results <- run.jags(hw_definition, n.chains=2)summary(results)
## Lower95 Median Upper95 Mean SD Mode
## prev[1] 0.015134996 0.028044913 0.04307840 0.028627279 0.007283481 NA
## prev[2] 0.690915330 0.715582840 0.74133190 0.715430590 0.012890610 NA
## prob_1[1] 0.014157200 0.026360418 0.04027954 0.026920260 0.006791926 NA
## prob_1[2] 0.007735796 0.017439528 0.02914599 0.018038765 0.005652170 NA
## prob_1[3] 0.002429108 0.008405407 0.01684339 0.008968083 0.003920497 NA
## prob_1[4] 0.927018653 0.946654656 0.96376544 0.946072892 0.009512237 NA
## prob_2[1] 0.643579183 0.669379993 0.69426946 0.669237567 0.012986677 NA
## prob_2[2] 0.019577493 0.028400701 0.03778528 0.028660257 0.004672813 NA
## prob_2[3] 0.015911645 0.023935304 0.03248369 0.024196034 0.004240767 NA
## prob_2[4] 0.253458655 0.277790209 0.30180763 0.277906142 0.012333790 NA
## se[1] 0.955584310 0.968754974 0.98063872 0.968429806 0.006411650 NA
## se[2] 0.951673572 0.966191131 0.98003042 0.965888714 0.007232896 NA
## sp[1] 0.970809468 0.982881713 0.99337004 0.982256434 0.005949219 NA
## sp[2] 0.983163647 0.992115396 0.99852033 0.991520346 0.004173473 NA
## MCerr MC%ofSD SSeff AC.10 psrf
## prev[1] 6.671775e-05 0.9 11918 -0.0141690806 1.0001733
## prev[2] 1.216458e-04 0.9 11229 -0.0055674909 1.0000695
## prob_1[1] 6.193725e-05 0.9 12025 -0.0148276346 1.0001792
## prob_1[2] 6.160316e-05 1.1 8418 -0.0028020366 0.9999652
## prob_1[3] 4.231125e-05 1.1 8586 0.0024832450 1.0000058
## prob_1[4] 9.451507e-05 1.0 10129 -0.0185115477 1.0000470
## prob_2[1] 1.144714e-04 0.9 12871 0.0008074865 1.0000947
## prob_2[2] 4.372839e-05 0.9 11419 -0.0020506095 1.0000674
## prob_2[3] 3.980346e-05 0.9 11351 0.0143988167 1.0000089
## prob_2[4] 1.124378e-04 0.9 12033 -0.0017992025 1.0001151
## se[1] 6.565136e-05 1.0 9538 0.0190230422 0.9999648
## se[2] 7.659965e-05 1.1 8916 -0.0020043666 1.0000175
## sp[1] 6.558031e-05 1.1 8229 -0.0026330896 0.9999660
## sp[2] 4.569984e-05 1.1 8340 0.0043092761 0.9999978Run the hw_definition model under the following
different scenarios and interpret the results in each case.
Change the priors for Se[1] and Sp[1] and try Beta(5,1).
hw_definition_1 <- c("model{
Population_1 ~ dmulti(prob_1, N_1)
Population_2 ~ dmulti(prob_2, N_2)
#Population_1
# Test1+ Test2+
prob_1[1] <- (prev[1] * ((se[1])*(se[2]))) + ((1-prev[1]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_1[2] <- (prev[1] * ((se[1])*(1-se[2]))) + ((1-prev[1]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_1[3] <- (prev[1] * ((1-se[1])*(se[2]))) + ((1-prev[1]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_1[4] <- (prev[1] * ((1-se[1])*(1-se[2]))) + ((1-prev[1]) * ((sp[1])*(sp[2])))
#Population_2
# Test1+ Test2+
prob_2[1] <- (prev[2] * ((se[1])*(se[2]))) + ((1-prev[2]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_2[2] <- (prev[2] * ((se[1])*(1-se[2]))) + ((1-prev[2]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_2[3] <- (prev[2] * ((1-se[1])*(se[2]))) + ((1-prev[2]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_2[4] <- (prev[2] * ((1-se[1])*(1-se[2]))) + ((1-prev[2]) * ((sp[1])*(sp[2])))
prev[1] ~ dbeta(1, 1)
prev[2] ~ dbeta(1, 1)
se[1] ~ dbeta(5, 1)I(1-sp[1], )
sp[1] ~ dbeta(5, 1)
se[2] ~ dbeta(1, 1)I(1-sp[2], )
sp[2] ~ dbeta(1, 1)
#data# Population_1, Population_2, N_1, N_2
#monitor# prev, prob_1, prob_2, se, sp
#inits# prev, se, sp
}
")
Population_1 <- as.numeric(pop_1[1:2,1:2])
Population_2 <- as.numeric(pop_2[1:2,1:2])
prev <- list(chain1=c(0.05,0.99), chain2=c(0.95,0.05))
se <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
sp <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
results_1 <- run.jags(hw_definition_1, n.chains=2)
# Remember to check convergence and effective sample size!
plot(results_1) # Click backwards to view all plotssummary(results_1)
## Lower95 Median Upper95 Mean SD Mode
## prev[1] 0.014826923 0.028134541 0.04306610 0.028749402 0.007438955 NA
## prev[2] 0.689752755 0.715504409 0.74021997 0.715359756 0.012947617 NA
## prob_1[1] 0.014773174 0.026470000 0.04116364 0.027035759 0.006934728 NA
## prob_1[2] 0.007476164 0.017319275 0.02904750 0.017931072 0.005663263 NA
## prob_1[3] 0.002048212 0.008427675 0.01675232 0.009013356 0.004000690 NA
## prob_1[4] 0.927204204 0.946435070 0.96494019 0.946019814 0.009646584 NA
## prob_2[1] 0.643645747 0.669390102 0.69435104 0.669222470 0.012932065 NA
## prob_2[2] 0.020076707 0.028527605 0.03795316 0.028734909 0.004622913 NA
## prob_2[3] 0.016287730 0.023785364 0.03256279 0.024051480 0.004207159 NA
## prob_2[4] 0.253883200 0.277833017 0.30226045 0.277991141 0.012376384 NA
## se[1] 0.955465800 0.969014218 0.98037901 0.968654657 0.006363729 NA
## se[2] 0.951839701 0.965934467 0.97960843 0.965740429 0.007153362 NA
## sp[1] 0.970736268 0.983012684 0.99345855 0.982375180 0.005961492 NA
## sp[2] 0.983262065 0.992083439 0.99895634 0.991469690 0.004255582 NA
## MCerr MC%ofSD SSeff AC.10 psrf
## prev[1] 7.102587e-05 1.0 10970 0.0010270420 1.0000487
## prev[2] 1.220861e-04 0.9 11247 0.0081960861 1.0000494
## prob_1[1] 6.592711e-05 1.0 11064 0.0003184868 1.0000450
## prob_1[2] 6.215152e-05 1.1 8303 0.0154585463 0.9999592
## prob_1[3] 4.446067e-05 1.1 8097 -0.0029011841 1.0000610
## prob_1[4] 9.457490e-05 1.0 10404 -0.0097024767 1.0002971
## prob_2[1] 1.124436e-04 0.9 13227 -0.0012019476 1.0004216
## prob_2[2] 4.219356e-05 0.9 12004 0.0099645710 1.0003116
## prob_2[3] 4.173170e-05 1.0 10164 -0.0057853883 1.0004645
## prob_2[4] 1.117283e-04 0.9 12270 0.0046014984 1.0000558
## se[1] 6.870037e-05 1.1 8580 -0.0066740779 1.0005514
## se[2] 7.433448e-05 1.0 9261 0.0094238536 1.0002994
## sp[1] 6.615695e-05 1.1 8120 0.0153886562 0.9999518
## sp[2] 4.798811e-05 1.1 7864 -0.0019992100 1.0001384I(1-sp[1], ) and I(1-sp[2],)
from the model and run it again. What happens now?hw_definition_2 <- c("model{
Population_1 ~ dmulti(prob_1, N_1)
Population_2 ~ dmulti(prob_2, N_2)
#Population_1
# Test1+ Test2+
prob_1[1] <- (prev[1] * ((se[1])*(se[2]))) + ((1-prev[1]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_1[2] <- (prev[1] * ((se[1])*(1-se[2]))) + ((1-prev[1]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_1[3] <- (prev[1] * ((1-se[1])*(se[2]))) + ((1-prev[1]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_1[4] <- (prev[1] * ((1-se[1])*(1-se[2]))) + ((1-prev[1]) * ((sp[1])*(sp[2])))
#Population_2
# Test1+ Test2+
prob_2[1] <- (prev[2] * ((se[1])*(se[2]))) + ((1-prev[2]) * ((1-sp[1])*(1-sp[2])))
# Test1+ Test2-
prob_2[2] <- (prev[2] * ((se[1])*(1-se[2]))) + ((1-prev[2]) * ((1-sp[1])*(sp[2])))
# Test1- Test2+
prob_2[3] <- (prev[2] * ((1-se[1])*(se[2]))) + ((1-prev[2]) * ((sp[1])*(1-sp[2])))
# Test1- Test2-
prob_2[4] <- (prev[2] * ((1-se[1])*(1-se[2]))) + ((1-prev[2]) * ((sp[1])*(sp[2])))
prev[1] ~ dbeta(1, 1)
prev[2] ~ dbeta(1, 1)
se[1] ~ dbeta(1, 1)
sp[1] ~ dbeta(1, 1)
se[2] ~ dbeta(1, 1)
sp[2] ~ dbeta(1, 1)
#data# Population_1, Population_2, N_1, N_2
#monitor# prev, prob_1, prob_2, se, sp
#inits# prev, se, sp
}
")
Population_1 <- as.numeric(pop_1[1:2,1:2])
Population_2 <- as.numeric(pop_2[1:2,1:2])
prev <- list(chain1=c(0.05,0.99), chain2=c(0.95,0.05))
se <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
sp <- list(chain1=c(0.5,0.99), chain2=c(0.99,0.5))
results_2 <- run.jags(hw_definition_2, n.chains=2)
# Remember to check convergence and effective sample size!
plot(results_2) # Click backwards to view all plots